{"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.3"},"colab":{"name":"terms_extraction.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true}},"cells":[{"cell_type":"markdown","metadata":{"id":"jgepWaeGYriv","colab_type":"text"},"source":["# Extraction of all tokens for all books in the science fiction and the random corpora.\n","\n","Code for extracting tokens from the corpora using the HTRC Extracted Features files and the HTRC Feature Reader"]},{"cell_type":"markdown","metadata":{"id":"Y0DLOZ8IbdKh","colab_type":"text"},"source":["### 0. Download packages (required on Google Colab)"]},{"cell_type":"code","metadata":{"id":"eXPSAm5wGlkD","colab_type":"code","colab":{}},"source":["from google.colab import drive\n","drive.mount('/content/drive')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"_Thp_Z90bJBf","colab_type":"code","colab":{}},"source":["!pip install htrc-feature-reader"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"81XCXBU2bfCA","colab_type":"code","colab":{}},"source":["import nltk\n","nltk.download('wordnet')"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"SJ1CzTY521um","colab_type":"text"},"source":["### 1. Extract htids for both the scifi and random corpus"]},{"cell_type":"code","metadata":{"id":"UTwml9d5Yri5","colab_type":"code","colab":{}},"source":["import pandas as pd\n","import os\n","import sys\n","import glob\n","import gzip\n","from   htrc_features import FeatureReader, utils as frutils\n","from   nltk.stem import WordNetLemmatizer\n","import time\n","import csv\n","\n","# Directories for input and output\n","figDir = 'figures'\n","resultsDir = 'results'\n","inputDir = 'inputs'\n","\n","# Full corpus data can be large; make it easy to stash outside GitHub/Google\n","bigDir = '.' # Base directory for large files\n","htrcefDir = os.path.join(bigDir, 'htrcef') # HTRC-EF JSONs"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"q1zPP-tcYrjF","colab_type":"code","colab":{}},"source":["def creating_htid_list(csv_path):\n","\n","    with open(csv_path, 'r', encoding='utf-8') as csv_file:\n","        dict_csv = csv.DictReader(csv_file)\n","        list_htids = [row[\"htid\"] for row in dict_csv]\n","\n","        return list_htids"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"tOlNsxS9YrjM","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1574080692762,"user_tz":-60,"elapsed":650,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mAbXMmexGk_lRJUtqkV2mkHhg7CLpyfyXZzkkac=s64","userId":"07332947698373575453"}},"outputId":"ea47a1a6-9dc2-42e8-a692-79c785db2c2c"},"source":["# Extract htids for the scifi corpus\n","scifi_htids = creating_htid_list(\"scifi_metadata_htids.csv\")\n","print(len(scifi_htids))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["331\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"WRtDhSRZYrjX","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1574080694311,"user_tz":-60,"elapsed":532,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mAbXMmexGk_lRJUtqkV2mkHhg7CLpyfyXZzkkac=s64","userId":"07332947698373575453"}},"outputId":"4b309a07-4468-4cd3-94d4-702a93d6b3ae"},"source":["# Extract htids for the random corpus\n","random_htids = creating_htid_list(\"random_metadata_htids.csv\")\n","print(len(random_htids))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["15874\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"SkNXMKkYYrjf","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1574073358730,"user_tz":-60,"elapsed":2380,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mAbXMmexGk_lRJUtqkV2mkHhg7CLpyfyXZzkkac=s64","userId":"07332947698373575453"}},"outputId":"65192671-5159-4315-9229-7a50205b99e6"},"source":["# Download the extracted features files for all volumes in the corpus\n","frutils.download_file(htids=scifi_htids, outdir=htrcefDir)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0, None)"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"code","metadata":{"id":"BMnZpDowYrjq","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1574073642884,"user_tz":-60,"elapsed":282491,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mAbXMmexGk_lRJUtqkV2mkHhg7CLpyfyXZzkkac=s64","userId":"07332947698373575453"}},"outputId":"59e0a819-8136-45a6-ce1c-c94e62ce0341"},"source":["# Download the extracted features files for all volumes in the corpus\n","frutils.download_file(htids=random_htids, outdir=htrcefDir)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(0, None)"]},"metadata":{"tags":[]},"execution_count":8}]},{"cell_type":"markdown","metadata":{"id":"z4hEroa-Yrj2","colab_type":"text"},"source":["### 2. Preparation to clean the tokens: removal of stopwords, punctuation, lemmatization"]},{"cell_type":"code","metadata":{"id":"5CSSeLsGYrj5","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1574080697985,"user_tz":-60,"elapsed":721,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mAbXMmexGk_lRJUtqkV2mkHhg7CLpyfyXZzkkac=s64","userId":"07332947698373575453"}},"outputId":"857374ae-715e-44f5-f816-445a37b350d0"},"source":["# Prepare list of stop words\n","stoplist_file = 'stopwords-underwood-goldstone.txt'\n","stoplist = [line.strip() for line in open(stoplist_file)]\n","stoplist = set(stoplist)\n","print(\"Words in stoplist:\", len(stoplist))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Words in stoplist: 6048\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"TxCIf_z5YrkB","colab_type":"code","colab":{}},"source":["# Functions to work with EF volumes\n","def encode_volid(volid, direction='path'):\n","    '''\n","    Transform htid into filename encoded version and vice versa\n","    '''\n","    encoding_fixes = {'+':':', '=':'/'}\n","    if direction=='path':\n","        encoding_fixes = {v:k for k,v in encoding_fixes.items()}\n","    for key in encoding_fixes:\n","        volid = volid.replace(key, encoding_fixes[key])\n","    return(volid)\n","\n","\n","# Penn treebank tags to keep\n","pos_to_include = [\n","    'FW',  # foreign\n","    'JJ',  # adjectives\n","    'JJR',\n","    'JJS',\n","    'MD',  # modal\n","    'NN',  # nouns (not proper)\n","    'NNS',\n","    'RB',  # adverbs\n","    'RBR',\n","    'RBS',\n","    'VB',  # verbs\n","    'VBD',\n","    'VBG',\n","    'VBN',\n","    'VBP',\n","    'VBZ'\n","]\n","\n","\n","# Translate Penn->WordNet PoS tags\n","#  Need WordNet PoS tags for lemmatizer\n","def get_wordnet_pos(treebank_tag):\n","    from nltk.corpus import wordnet\n","    if treebank_tag.startswith('J'):\n","        return wordnet.ADJ\n","    elif treebank_tag.startswith('V'):\n","        return wordnet.VERB\n","    elif treebank_tag.startswith('M'):\n","        return wordnet.VERB\n","    elif treebank_tag.startswith('R'):\n","        return wordnet.ADV\n","    else:\n","        return wordnet.NOUN\n","    "],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"1V8jMALVYrkI","colab_type":"text"},"source":["### 3. Core functions\n","Extraction of the tokens from the dataframe created by the FeatureReader; cleaning of the tokens; list of lists creation; combination of the two separate lists, one for each corpus;"]},{"cell_type":"code","metadata":{"id":"V6vnKQRoYrkL","colab_type":"code","colab":{}},"source":["#Lemmatize the token and create a lemmas dict to have unique lemmas\n","def dict_extraction(token, lemmas):\n","\n","    word = token.Index[0]\n","    pos = get_wordnet_pos(token.Index[1])\n","    lem_word = lemmatizer.lemmatize(word, pos=pos)\n","    count = token.count\n","    if lem_word not in lemmas.keys():\n","        lemmas[lem_word] = count\n","    else:\n","        lemmas[lem_word] += count\n","            \n","    return lemmas"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"BXlW5kesYrkT","colab_type":"code","colab":{}},"source":["# Extract lemmatized tokens for each volume\n","lemmatizer = WordNetLemmatizer() # Initialize lemmatizer\n","\n","\n","def vol_extraction(volid, corpus):\n","    \n","    vol = FeatureReader(os.path.join(htrcefDir,f'{encode_volid(volid)}.json.bz2')).first()\n","    skip_last = vol.page_count - 10 #calculate total number of pages per book to skip the last ten pages\n","    year = vol.year\n","#Slice dataframe: only the central part of the page, only the mid of the book, only tokens with a valid PoS \n","    vol_df = vol.tokenlist(case=False, section='body').loc[10:skip_last].query('pos in @pos_to_include').groupby(level=[2, 3]).sum()\n","    total_tokens = vol_df[\"count\"].sum() #total number of tokens per book to calculate the occ 100k    \n","#Retrieve each row and only compute the function list_extraction if the token is not a stopword\n","    lemmas = dict()\n","    for token in vol_df.itertuples():\n","        if token.Index[0] not in stoplist and token.count > 1:\n","            lemmas = dict_extraction(token, lemmas)\n","    vol_lemmas = ([key, value, round((value / total_tokens * 100000), 2)] for key, value in lemmas.items())\n","    \n","    return year, vol_lemmas"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"Ns5NqI5IYrka","colab_type":"code","colab":{}},"source":["# Extract lemmatized tokens for the entire corpus\n","def corpus_extraction(htids_list, corpus):\n","    \n","    results = dict()\n","    for volid in htids_list:\n","        year, vol_lemmas = vol_extraction(volid, corpus)\n","        if year not in results.keys():\n","          results[year] = dict()\n","          results[year][volid] = list(vol_lemmas)\n","        else:\n","          results[year][volid] = list(vol_lemmas)\n","    \n","    return results"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"M5PcPfBzYrki","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1574080928513,"user_tz":-60,"elapsed":210051,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mAbXMmexGk_lRJUtqkV2mkHhg7CLpyfyXZzkkac=s64","userId":"07332947698373575453"}},"outputId":"911cc1ac-5149-44ee-9a99-df91a6baa737"},"source":["start1 = time.perf_counter()\n","scifi_lemmas = corpus_extraction(scifi_htids, \"scifi\")\n","end1 = time.perf_counter()\n","print(end1 - start1)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["208.35502319199986\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"Y03FjkoEYrkp","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":34},"executionInfo":{"status":"ok","timestamp":1574090106301,"user_tz":-60,"elapsed":9141420,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mAbXMmexGk_lRJUtqkV2mkHhg7CLpyfyXZzkkac=s64","userId":"07332947698373575453"}},"outputId":"a3ea3745-3762-4618-ab29-13df1a3bc9b5"},"source":["###CHANGE CSV BEFORE RUNNING\n","start = time.perf_counter()\n","random_lemmas = corpus_extraction(random_htids, \"random\")\n","end = time.perf_counter()\n","print(end - start)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["9140.770264236002\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"xE34cj9xo7Ta","colab_type":"code","colab":{}},"source":["combined = dict()\n","combined[\"scifi\"] = scifi_lemmas\n","combined[\"random\"] = random_lemmas"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"colab_type":"text","id":"wJYNa6ansODN"},"source":["### 4. Creation of the dataframe and storing in a csv compressed file"]},{"cell_type":"code","metadata":{"id":"rL_EIUyZYrlG","colab_type":"code","colab":{}},"source":["with open(os.path.join(bigDir,\"termsdata.csv\"),'w', encoding='utf-8', newline='') as termsdata:\n","  writer = csv.writer(termsdata)\n","  writer.writerow((\"corpus\", \"year\", \"htid\", \"lem_word\", \"count\", \"occurs_100k\"))\n","  for corpus, years in combined.items():\n","    for year, volumes in years.items():\n","      for volume, lemmas in volumes.items():\n","        for lemma in lemmas:\n","          writer.writerow((corpus, year, volume, lemma[0], str(lemma[1]), str(lemma[2])))\n"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"FvCryEGSYrlQ","colab_type":"code","colab":{}},"source":["import gzip\n","import shutil\n","\n","with open('termsdata.csv', 'rb') as f_in:\n","    with gzip.open('termsdata.csv.gz', 'wb') as f_out:\n","        shutil.copyfileobj(f_in, f_out)"],"execution_count":null,"outputs":[]}]}